Wstęp

Poniżej podjęto próbę analizy danych z WDI oraz predykcji ceny złota na jej podstawię. Z analizy wynika, że wiele z najważniejszych atrybutów które znalazł regresor jest nietypowa np. Czas potrzebny do wybudowania magazynu.

Wykorzystane biblioteki:

library("knitr")
library("DT")
library("readxl") # Reading excel files
library("tidyr")
library("dplyr")
library("zoo") # NA interpolation
library("corrr")
library("ggplot2")
library("plotly")
library("ggcorrplot")
library("caret")

Zapewnienie powtarzalności:

set.seed(23)

Wczytywanie danych

WDI <- read_excel("Data_pack/World_Development_Indicators.xlsx", sheet = 1, na="..")

currencyEx <- read.table("Data_pack/CurrencyExchangeRates.csv", header=TRUE, sep = ",")

Czyszczenie danych:

1. Usunięcie ostatnich 5 linii (puste/metadane).

2. Usunięcie kolumny z kodem kraju, ponieważ jest jego nazwa oraz nazwy serii, ponieważ jest jej kod.

3. Zamiana kolumn lat na 1 kolumne z rokiem.

4. Zamiana jednej kolumny nazwy serii na różne kolumny z danymi.

5. Zamiana formatu lat na 4 cyfry.

6. Interpolacja oraz ekstrapolacja wszystkich możliwych danych w miejce braku danych, aby zapewnić jak najwięcej danych do późniejszej analizy.

7. Usuwanie serii/krajów dla których posiadamy najmniej danych procentowo, aż uzyskamy zbiór bez brakujących wartości.

WDI <- WDI %>% slice_head(n = 44304) %>% 
  select(-c("Country Code", "Series Name")) %>% # 2
  pivot_longer(3:53, names_to="Year") %>% # 3
  pivot_wider(names_from = `Series Code`, values_from = value) %>% # 4
  mutate(Year = substr(Year, 1, 4))
# 6. Interpolating and extrapolating all possible values
WDI <- WDI %>%
  group_by(`Country Name`) %>%
  mutate_at(3:215, na.approx, na.rm=FALSE, rule = 2) %>%
  ungroup()

WDI <- WDI %>% arrange(`Country Name`, Year)


# 7. Removing measures/countries with most % of Na's until we are left with no Na's 
while(TRUE){
  summirize_NAs <- WDI %>%
    select(-2) %>%
    group_by(`Country Name`) %>%
    summarise_all(funs(sum(is.na(.)))) %>%
    ungroup();
  row_NAs <- summirize_NAs %>%
    mutate(sum = rowSums(across(where(is.numeric)))/(ncol(summirize_NAs)-1)/51);
  col_NAs <- row_NAs %>%
    summarise(across(-1, ~ sum(., is.na(.), 0)/nrow(row_NAs)/51));
  
  max_row <- max(row_NAs %>% select(sum));
  max_row_index <- which(row_NAs %>% select(sum)==max_row);
  
  max_col <- max(col_NAs);
  max_col_index <- which(col_NAs==max_col);
  if(max_col==max_row && max_col==0) break;
  if(max_col > max_row) { WDI <- WDI %>% select(-(max_col_index+2));
  } else WDI <- WDI %>% slice(-((51*(max_row_index-1)+1):(51*max_row_index)));
}

Rozmiar zbioru i statystyki wartości

Po uzyskaniu zbioru bez wartości pustych pozostało 122 z początkowych 213 serii (57.3%) oraz 141 z 208 początkowych krajów (67.8%) z danymi ekstrapolowanymi do wszystkich 51 lat.

dim(WDI)
## [1] 7191  124
kable(summary.data.frame(WDI))
Country Name Year SP.URB.GROW SP.URB.TOTL.IN.ZS SP.URB.TOTL TX.VAL.TRAN.ZS.WT TM.VAL.TRAN.ZS.WT BG.GSR.NFSV.GD.ZS NE.TRD.GNFS.ZS NY.GDP.TOTL.RT.ZS EN.ATM.GHGT.KT.CE SH.ALC.PCAP.LI IC.WRH.DURS IC.LGL.DURS IC.ELC.TIME IC.TAX.PAYM SP.DYN.TO65.FE.ZS SP.DYN.TO65.MA.ZS SH.STA.SUIC.P5 SH.STA.SUIC.FE.P5 SH.STA.SUIC.MA.P5 IC.LGL.CRED.XQ BM.GSR.NFSV.CD BX.GSR.NFSV.CD SL.EMP.SELF.MA.ZS SL.EMP.SELF.ZS SL.EMP.SELF.FE.ZS IT.NET.SECR IT.NET.SECR.P6 SE.SEC.TCHR SE.SEC.ENRL IP.JRN.ARTC.SC SP.RUR.TOTL.ZG SP.RUR.TOTL.ZS SP.RUR.TOTL EG.FEC.RNEW.ZS ER.H2O.INTR.PC ER.H2O.INTR.K3 EG.ELC.RNEW.ZS SE.PRM.ENRL.TC.ZS SG.GEN.PARL.ZS BM.GSR.FCTY.CD BX.GSR.FCTY.CD SE.PRM.AGES BN.KLT.PTXL.CD BX.PEF.TOTL.CD.WD SP.POP.TOTL SP.POP.TOTL.MA.IN SP.POP.TOTL.MA.ZS SP.POP.TOTL.FE.ZS SP.POP.TOTL.FE.IN SP.POP.GROW EN.POP.DNST SP.POP.65UP.TO.ZS SP.POP.1564.TO.ZS SP.POP.0014.TO.ZS EN.ATM.PM25.MC.M3 EN.ATM.PM25.MC.ZS EN.ATM.PM25.MC.T1.ZS EN.ATM.PM25.MC.T2.ZS EN.ATM.PM25.MC.T3.ZS SH.DTH.MORT EN.ATM.NOXE.ZG EN.ATM.NOXE.KT.CE EN.ATM.NOXE.EG.ZS NY.GSR.NFCY.CD NY.GSR.NFCY.CN BN.GSR.FCTY.CD FM.AST.DOMS.CN NY.GDP.NGAS.RT.ZS SP.DYN.IMRT.IN SH.STA.TRAF.P5 EN.ATM.METH.ZG EN.ATM.METH.KT.CE EN.ATM.METH.EG.KT.CE TX.VAL.MRCH.HI.ZS SP.DYN.LE00.IN AG.LND.TOTL.K2 SL.TLF.TOTL.IN ST.INT.XPND.CD SM.POP.TOTL.ZS IT.NET.USER.ZS NE.IMP.GNFS.CD NE.IMP.GNFS.ZS NY.GNS.ICTR.ZS NE.DAB.TOTL.ZS NE.DAB.TOTL.CD NY.GNS.ICTR.CD NY.GDS.TOTL.ZS NY.GDS.TOTL.CD BX.GSR.MRCH.CD BM.GSR.MRCH.CD NY.GDP.PCAP.CD NY.GDP.PCAP.KD.ZG NY.GDP.MKTP.KD.ZG NY.GDP.MKTP.CD TX.VAL.FUEL.ZS.UN TM.VAL.FUEL.ZS.UN TX.VAL.FOOD.ZS.UN TM.VAL.FOOD.ZS.UN NE.EXP.GNFS.CD SL.IND.EMPL.ZS SL.SRV.EMPL.ZS SL.AGR.EMPL.ZS SL.EMP.MPYR.ZS IC.BUS.DFRN.XQ SH.STA.DIAB.ZS SH.XPD.CHEX.PC.CD SH.XPD.CHEX.GD.ZS EN.ATM.CO2E.SF.ZS EN.ATM.CO2E.SF.KT EN.ATM.CO2E.LF.KT EN.ATM.CO2E.LF.ZS EN.ATM.CO2E.GF.KT EN.ATM.CO2E.GF.ZS EN.ATM.CO2E.PC EN.ATM.CO2E.KT EN.ATM.CO2E.PP.GD EN.ATM.CO2E.PP.GD.KD EN.ATM.CO2E.KD.GD SP.DYN.CBRT.IN AG.LND.PRCP.MM FB.ATM.TOTL.P5 EG.ELC.ACCS.ZS
Length:7191 Length:7191 Min. :-187.142 Min. : 2.845 Min. : 10522 Min. :-381.37 Min. : 0.2916 Min. : 1.165 Min. : 0.021 Min. : 0.0000 Min. : 21 Min. : 0.003 Min. : 27.0 Min. : 210.0 Min. : 13.0 Min. : 3.00 Min. : 6.464 Min. : 1.477 Min. : 0.30 Min. : 0.000 Min. : 0.40 Min. : 0.000 Min. :1.000e+06 Min. :0.000e+00 Min. : 0.39 Min. : 0.41 Min. : 0.07 Min. : 0 Min. : 0.00 Min. : 0 Min. : 0 Min. : 0.0 Min. :-235.79245 Min. : 0.00 Min. : 0 Min. : 0.000 Min. : 0 Min. : 0.0 Min. : 0.000 Min. : 8.141 Min. : 0.000 Min. :-2.187e+08 Min. :-5.061e+07 Min. :4.000 Min. :-8.080e+11 Min. :-2.441e+11 Min. :8.537e+04 Min. : 45189 Min. :44.37 Min. :23.29 Min. : 39622 Min. :-6.7661 Min. : 0.8232 Min. : 0.7146 Min. :46.70 Min. :12.45 Min. : 5.861 Min. : 0.00 Min. : 0.0000 Min. : 0.00000 Min. : 0.00 Min. : 17 Min. :-84.585 Min. : 0 Min. : 0.000 Min. :-9.905e+10 Min. :-4.813e+14 Min. :-1.052e+11 Min. :-5.424e+13 Min. : 0.000000 Min. : 1.70 Min. : 0.7 Min. : -78.979 Min. : 13.1 Min. : 0 Min. : 0.2453 Min. :18.91 Min. : 300 Min. : 57406 Min. :3.500e+06 Min. : 0.0326 Min. : 0.0000 Min. :0.000e+00 Min. : 0.00 Min. :-236.27 Min. : 21.21 Min. :5.393e+07 Min. :-2.601e+10 Min. :-141.97 Min. :-7.622e+09 Min. :5.000e+06 Min. :5.573e+06 Min. : 22.8 Min. :-64.9924 Min. :-64.047 Min. :4.061e+07 Min. : 0.0000 Min. : 0.00933 Min. : 0.00058 Min. : 0.4738 Min. :6.933e+05 Min. : 0.28 Min. : 5.34 Min. : 0.06 Min. : 0.000 Min. :31.95 Min. : 1.000 Min. : 4.335 Min. : 1.453 Min. : -4.324 Min. : -114 Min. : -161.3 Min. : -6.089 Min. : 0.0 Min. : 0.000 Min. : 0.00433 Min. : 4 Min. :0.01479 Min. :0.01041 Min. :0.02151 Min. : 5.90 Min. : 18.1 Min. : 0.000 Min. : 0.5339
Class :character Class :character 1st Qu.: 1.117 1st Qu.: 33.870 1st Qu.: 1190394 1st Qu.: 13.71 1st Qu.:27.9935 1st Qu.: 8.929 1st Qu.: 46.374 1st Qu.: 0.5579 1st Qu.: 11500 1st Qu.: 2.590 1st Qu.:142.0 1st Qu.: 447.0 1st Qu.: 66.0 1st Qu.: 12.00 1st Qu.:59.478 1st Qu.:50.834 1st Qu.: 5.00 1st Qu.: 2.500 1st Qu.: 7.10 1st Qu.: 2.000 1st Qu.:2.949e+08 1st Qu.:1.846e+08 1st Qu.:19.82 1st Qu.:17.71 1st Qu.:14.21 1st Qu.: 13 1st Qu.: 1.46 1st Qu.: 6703 1st Qu.: 137241 1st Qu.: 28.9 1st Qu.: -0.40820 1st Qu.:29.69 1st Qu.: 1035996 1st Qu.: 6.679 1st Qu.: 1424 1st Qu.: 11.5 1st Qu.: 3.175 1st Qu.: 17.606 1st Qu.: 6.091 1st Qu.: 7.935e+07 1st Qu.: 1.999e+07 1st Qu.:6.000 1st Qu.:-5.821e+07 1st Qu.: 0.000e+00 1st Qu.:2.936e+06 1st Qu.: 1457900 1st Qu.:48.83 1st Qu.:49.82 1st Qu.: 1494832 1st Qu.: 0.6204 1st Qu.: 18.8641 1st Qu.: 3.2608 1st Qu.:53.38 1st Qu.:22.68 1st Qu.: 16.469 1st Qu.: 99.94 1st Qu.: 0.0000 1st Qu.: 0.00828 1st Qu.: 67.93 1st Qu.: 1078 1st Qu.:-16.160 1st Qu.: 1299 1st Qu.: 3.022 1st Qu.:-1.106e+09 1st Qu.:-1.423e+10 1st Qu.:-1.236e+09 1st Qu.: 3.969e+09 1st Qu.: 0.000000 1st Qu.: 12.40 1st Qu.:12.4 1st Qu.: -6.591 1st Qu.: 3250.6 1st Qu.: 400 1st Qu.: 55.7356 1st Qu.:58.32 1st Qu.: 48100 1st Qu.: 1305780 1st Qu.:9.600e+07 1st Qu.: 0.9909 1st Qu.: 0.0000 1st Qu.:1.260e+09 1st Qu.: 25.08 1st Qu.: 13.54 1st Qu.: 97.86 1st Qu.:3.733e+09 1st Qu.: 5.670e+08 1st Qu.: 10.89 1st Qu.: 3.471e+08 1st Qu.:6.449e+08 1st Qu.:9.854e+08 1st Qu.: 615.2 1st Qu.: -0.6905 1st Qu.: 1.033 1st Qu.:3.286e+09 1st Qu.: 0.5267 1st Qu.: 7.32520 1st Qu.: 7.09749 1st Qu.: 8.8995 1st Qu.:9.338e+08 1st Qu.:13.42 1st Qu.:34.86 1st Qu.: 9.48 1st Qu.: 1.040 1st Qu.:52.87 1st Qu.: 4.500 1st Qu.: 32.842 1st Qu.: 4.188 1st Qu.: 0.000 1st Qu.: 0 1st Qu.: 1694.2 1st Qu.: 40.872 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.54559 1st Qu.: 2251 1st Qu.:0.16043 1st Qu.:0.12069 1st Qu.:0.24574 1st Qu.:14.73 1st Qu.: 593.0 1st Qu.: 3.042 1st Qu.: 40.8004
Mode :character Mode :character Median : 2.447 Median : 52.589 Median : 3542777 Median : 24.82 Median :41.8922 Median : 13.966 Median : 66.165 Median : 2.5902 Median : 42440 Median : 5.842 Median :186.0 Median : 564.0 Median : 95.0 Median : 31.00 Median :77.064 Median :63.624 Median : 8.80 Median : 4.300 Median : 12.90 Median : 5.000 Median :1.169e+09 Median :1.049e+09 Median :38.53 Median :38.57 Median :40.40 Median : 115 Median : 13.28 Median : 32950 Median : 474281 Median : 288.0 Median : 0.67063 Median :47.41 Median : 3303149 Median :26.788 Median : 3914 Median : 49.0 Median : 20.917 Median : 24.983 Median :11.111 Median : 5.060e+08 Median : 1.664e+08 Median :6.000 Median : 0.000e+00 Median : 0.000e+00 Median :7.838e+06 Median : 3876199 Median :49.55 Median :50.45 Median : 3984469 Median : 1.6179 Median : 55.3736 Median : 4.6870 Median :60.62 Median :34.07 Median : 24.489 Median :100.00 Median : 0.4832 Median : 41.96430 Median : 99.90 Median : 6999 Median : -1.934 Median : 4770 Median : 6.397 Median :-1.470e+08 Median :-5.586e+08 Median :-1.570e+08 Median : 6.594e+10 Median : 0.000057 Median : 31.10 Median :17.2 Median : 1.136 Median : 9410.0 Median : 1300 Median : 72.7388 Median :68.79 Median : 199810 Median : 3568471 Median :4.220e+08 Median : 3.1633 Median : 0.1259 Median :5.679e+09 Median : 34.86 Median : 20.56 Median :103.06 Median :1.620e+10 Median : 3.494e+09 Median : 20.09 Median : 2.869e+09 Median :3.482e+09 Median :4.856e+09 Median : 1917.2 Median : 1.9469 Median : 3.525 Median :1.419e+10 Median : 3.0458 Median :12.34879 Median : 16.15486 Median :13.1699 Median :4.605e+09 Median :20.82 Median :48.13 Median :26.61 Median : 2.800 Median :62.11 Median : 6.400 Median : 131.969 Median : 5.503 Median : 5.405 Median : 554 Median : 6703.3 Median : 64.271 Median : 579.4 Median : 3.089 Median : 2.26642 Median : 13800 Median :0.28513 Median :0.20237 Median :0.38302 Median :24.45 Median :1026.0 Median : 18.822 Median : 96.9309
NA NA Mean : 2.721 Mean : 51.925 Mean : 17795487 Mean : 28.64 Mean :42.3491 Mean : 20.201 Mean : 75.522 Mean : 7.0120 Mean : 238871 Mean : 6.517 Mean :206.1 Mean : 645.8 Mean :118.5 Mean : 31.61 Mean :71.468 Mean :61.648 Mean :11.48 Mean : 5.527 Mean : 17.72 Mean : 4.808 Mean :1.355e+10 Mean :1.364e+10 Mean :43.26 Mean :44.39 Mean :45.70 Mean : 42934 Mean : 1093.22 Mean : 156892 Mean : 2706222 Mean : 9687.4 Mean : -0.08267 Mean :48.08 Mean : 20129583 Mean :36.429 Mean : 22145 Mean : 290.7 Mean : 35.380 Mean : 27.943 Mean :13.631 Mean : 1.251e+10 Mean : 1.204e+10 Mean :6.235 Mean :-2.166e+09 Mean : 2.274e+09 Mean :3.793e+07 Mean : 19093379 Mean :49.70 Mean :50.30 Mean : 18831691 Mean : 1.6401 Mean : 110.1329 Mean : 7.1296 Mean :59.94 Mean :32.93 Mean : 29.004 Mean : 93.35 Mean : 25.3769 Mean : 47.63829 Mean : 77.59 Mean : 70685 Mean : 3.346 Mean : 17539 Mean : 9.081 Mean :-3.211e+08 Mean :-1.635e+12 Mean :-4.732e+08 Mean : 3.815e+13 Mean : 0.280302 Mean : 45.74 Mean :18.8 Mean : 9.857 Mean : 44625.0 Mean : 14990 Mean : 67.4559 Mean :65.80 Mean : 846340 Mean : 18061107 Mean :4.431e+09 Mean : 6.9947 Mean :14.3044 Mean :5.838e+10 Mean : 40.42 Mean : 20.40 Mean :104.99 Mean :2.452e+11 Mean : 6.647e+10 Mean : 18.98 Mean : 6.225e+10 Mean :4.836e+10 Mean :4.726e+10 Mean : 7365.6 Mean : 1.3644 Mean : 3.076 Mean :2.451e+11 Mean : 14.7042 Mean :14.25896 Mean : 25.06494 Mean :14.4835 Mean :5.899e+10 Mean :20.44 Mean :47.70 Mean :31.86 Mean : 3.121 Mean :61.98 Mean : 6.713 Mean : 670.953 Mean : 5.877 Mean : 17.592 Mean : 67918 Mean : 61102.1 Mean : 63.522 Mean : 30666.6 Mean : 12.916 Mean : 4.52634 Mean : 165030 Mean :0.40723 Mean :0.27200 Mean :0.61531 Mean :26.67 Mean :1147.3 Mean : 34.736 Mean : 72.9918
NA NA 3rd Qu.: 4.054 3rd Qu.: 70.314 3rd Qu.: 12296820 3rd Qu.: 39.86 3rd Qu.:55.3577 3rd Qu.: 22.394 3rd Qu.: 95.561 3rd Qu.: 8.9319 3rd Qu.: 134735 3rd Qu.:10.100 3rd Qu.:252.0 3rd Qu.: 785.0 3rd Qu.:148.0 3rd Qu.: 43.00 3rd Qu.:84.780 3rd Qu.:74.000 3rd Qu.:14.50 3rd Qu.: 7.600 3rd Qu.: 21.70 3rd Qu.: 6.000 3rd Qu.:6.453e+09 3rd Qu.:6.239e+09 3rd Qu.:66.22 3rd Qu.:69.81 3rd Qu.:79.33 3rd Qu.: 1717 3rd Qu.: 187.86 3rd Qu.: 106576 3rd Qu.: 1827545 3rd Qu.: 4386.0 3rd Qu.: 1.81238 3rd Qu.:66.13 3rd Qu.: 10892256 3rd Qu.:65.414 3rd Qu.: 20576 3rd Qu.: 182.5 3rd Qu.: 63.263 3rd Qu.: 35.319 3rd Qu.:19.149 3rd Qu.: 4.571e+09 3rd Qu.: 1.857e+09 3rd Qu.:7.000 3rd Qu.: 1.340e+07 3rd Qu.: 9.186e+06 3rd Qu.:2.580e+07 3rd Qu.: 12820131 3rd Qu.:50.18 3rd Qu.:51.17 3rd Qu.: 12878546 3rd Qu.: 2.6058 3rd Qu.: 112.8378 3rd Qu.:10.7652 3rd Qu.:65.99 3rd Qu.:43.30 3rd Qu.: 35.835 3rd Qu.:100.00 3rd Qu.: 46.7282 3rd Qu.: 99.39609 3rd Qu.:100.00 3rd Qu.: 41540 3rd Qu.: 8.927 3rd Qu.: 14654 3rd Qu.: 10.342 3rd Qu.:-2.819e+06 3rd Qu.:-4.750e+03 3rd Qu.:-1.079e+07 3rd Qu.: 7.449e+11 3rd Qu.: 0.080898 3rd Qu.: 69.50 3rd Qu.:25.8 3rd Qu.: 13.678 3rd Qu.: 32265.0 3rd Qu.: 6460 3rd Qu.: 84.2482 3rd Qu.:74.00 3rd Qu.: 653540 3rd Qu.: 11985666 3rd Qu.:3.319e+09 3rd Qu.: 8.9348 3rd Qu.:15.2777 3rd Qu.:2.723e+10 3rd Qu.: 50.60 3rd Qu.: 27.38 3rd Qu.:110.03 3rd Qu.:9.050e+10 3rd Qu.: 2.759e+10 3rd Qu.: 28.12 3rd Qu.: 2.382e+10 3rd Qu.:2.370e+10 3rd Qu.:2.275e+10 3rd Qu.: 6956.9 3rd Qu.: 4.2256 3rd Qu.: 5.845 3rd Qu.:9.269e+10 3rd Qu.: 13.3106 3rd Qu.:19.05664 3rd Qu.: 35.51997 3rd Qu.:18.3424 3rd Qu.:2.731e+10 3rd Qu.:26.79 3rd Qu.:62.27 3rd Qu.:48.36 3rd Qu.: 4.450 3rd Qu.:72.43 3rd Qu.: 8.300 3rd Qu.: 602.116 3rd Qu.: 7.470 3rd Qu.: 27.662 3rd Qu.: 12526 3rd Qu.: 33444.9 3rd Qu.: 89.747 3rd Qu.: 12884.0 3rd Qu.: 21.451 3rd Qu.: 6.71555 3rd Qu.: 74630 3rd Qu.:0.46803 3rd Qu.:0.32905 3rd Qu.:0.67852 3rd Qu.:38.20 3rd Qu.:1622.0 3rd Qu.: 50.471 3rd Qu.:100.0000
NA NA Max. : 48.936 Max. :100.000 Max. :861289359 Max. : 100.00 Max. :98.4674 Max. :304.276 Max. :408.362 Max. :87.5075 Max. :12355240 Max. :19.950 Max. :714.0 Max. :1785.0 Max. :602.0 Max. :147.00 Max. :95.174 Max. :92.978 Max. :92.60 Max. :39.500 Max. :147.80 Max. :12.000 Max. :5.884e+11 Max. :8.758e+11 Max. :92.78 Max. :95.10 Max. :99.38 Max. :46678110 Max. :277330.58 Max. :6550540 Max. :132161360 Max. :528263.2 Max. : 29.62834 Max. :97.16 Max. :898024053 Max. :98.343 Max. :333819 Max. :5661.0 Max. :100.000 Max. :100.236 Max. :63.750 Max. : 8.993e+11 Max. : 1.136e+12 Max. :8.000 Max. : 2.827e+11 Max. : 7.257e+11 Max. :1.402e+09 Max. :719160951 Max. :76.71 Max. :55.63 Max. :682951049 Max. :17.5122 Max. :1801.8067 Max. :28.3973 Max. :86.40 Max. :50.76 Max. :100.784 Max. :100.00 Max. :100.0000 Max. :100.00000 Max. :100.00 Max. :4471008 Max. :586.284 Max. :546990 Max. :192.227 Max. : 2.923e+11 Max. : 1.051e+14 Max. : 2.578e+11 Max. : 1.021e+16 Max. :22.413461 Max. :190.10 Max. :64.6 Max. :1046.901 Max. :1242150.0 Max. :746030 Max. :100.0000 Max. :84.36 Max. :16389950 Max. :787183156 Max. :1.824e+11 Max. :82.4923 Max. :99.6529 Max. :3.138e+12 Max. :236.39 Max. : 87.10 Max. :261.43 Max. :2.204e+13 Max. : 6.257e+12 Max. : 88.39 Max. : 6.281e+12 Max. :2.497e+12 Max. :2.557e+12 Max. :118823.6 Max. : 92.2018 Max. : 88.958 Max. :2.143e+13 Max. :359.2561 Max. :94.05709 Max. :136.10844 Max. :62.4160 Max. :2.723e+12 Max. :59.58 Max. :88.51 Max. :92.37 Max. :17.880 Max. :87.17 Max. :22.000 Max. :10623.850 Max. :20.413 Max. :216.648 Max. :7499587 Max. :2494601.4 Max. :258.524 Max. :1498556.2 Max. :167.418 Max. :87.68964 Max. :10313460 Max. :2.52446 Max. :1.96865 Max. :5.35097 Max. :56.95 Max. :3240.0 Max. :288.585 Max. :100.0000

Korelacja między seriami

WDI %>% select(-(1:2)) %>% correlate()
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
ggcorrplot(cor(WDI %>% select(-(1:2))), tl.cex = 5, hc.order = TRUE)

Predykcja ceny złota:

1. Uśredniamy cene złota w kolejnych latach, usuwamy lata 1968, 1969 i 2021 o których nie mamy danych. Ceny zostawiamy tylko w dolarach amerykańskich i zmieniamy nazwę jej kolumny na Price. Pokazujemy interaktywny wykres zeleżności ceny złota od czasu.

2. Uśredniamy wszystkie dane krajów w kolejnych latach i łaczymy tabele z WDI z tabelą cen złota.

3. Usuwamy kolumnę z rokiem.

4. TWorzymy zbiór treningowy i testowy.

5. Ustawiamy metodę trenowania na repeatedCV 5, powtarzane 5 razy.

6. Trenujemy model Algorytmem random forest (ranger)

7. Testujemy model na zbiorze tesowym i obliczmy RMSE.

# 1
gold_prices <- read.csv("Data_pack/Gold prices.csv", sep=',')
gold_prices <- gold_prices %>%
  rename(Price = `USD..AM.`) %>% 
  mutate(Year = substr(Date, 1, 4)) %>% 
  select(-1) %>% 
  group_by(Year) %>% 
  summarise(across(1, ~ mean(., na.rm = TRUE))) %>% 
  slice(-c(1,2,54))

# 2
Gold_prediction <- WDI %>% select(-1) %>% group_by(Year) %>% summarise(across(-1, ~ mean(.)))

# 3
Gold_prediction <- Gold_prediction %>% inner_join(gold_prices, by='Year') 
p <- Gold_prediction %>%
     ggplot( aes(Year, Price)) +
     geom_point() +
     theme_bw()
ggplotly(p)
trainIndx <- createDataPartition(Gold_prediction$Price,
                                 p = 4/5, 
                                 list = FALSE)
 #Do not use Year for the model
Gold_prediction_filtered <- Gold_prediction %>% select(-Year)
training <- Gold_prediction_filtered[ trainIndx, ]
testing  <- Gold_prediction_filtered[-trainIndx, ]

fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  verboseIter = FALSE,
  returnResamp = "all")

set.seed(12)
rrfFit <- train(Price ~ ., 
                 data = training,
                 method = 'ranger',
                 tuneLength = 10, 
                 trControl = fitControl,
                 num.trees = 700,
                 importance = "permutation")


trellis.par.set(caretTheme())
plot(rrfFit)

testPred <- predict(rrfFit , testing)

RMSE uzyskany przez model to 170.8670375

Analiza ważności atrybutów:

10 najważniejszych atrybutów wraz z opisami:

WDI_metadata <- read_excel("Data_pack/World_Development_Indicators.xlsx", sheet = 2, na="..")
imp <- varImp(rrfFit)$importance %>% arrange(-Overall) %>% slice_head(n = 10)
imp
WDI_metadata %>% select(c("Indicator Name", "Long definition")) %>% filter(WDI_metadata$Code %in% rownames(imp))